/*

	Purpose: Bring in the Preston and Haines occupational income scores and collapse them 
	         into ANES occupations.
	Output: ./output/1_PH1901_incomescores.dta (occupations are coarsened)

*/

* Session setup: start with no data in memory, turn off -more- paging, and
* work from the income-scores folder (the relative paths below start there).
clear
set more off
cd "$Mydirectory1/1_DataSources/1900_IncomeScores/"
 
*------------------------------------------------------------------------------*
*------------------------------------------------------------------------------*

*** Occupation counts in 1900 Census 

use ../CensusData/input/Census1900_full_fathers30to50.dta, clear //download from IPUMS USA

	* Reduce to one row per occ1950 code; -number- is the count of records
	* carrying that code.
	contract occ1950, freq(number)

	* Kept in a tempfile so it can be merged onto the PH scores below.
	tempfile counts
	save `counts'

*------------------------------------------------------------------------------*
*------------------------------------------------------------------------------*

*****************
*** NOW BRING IN PRESTON AND HAINES DATA AND COLLAPSE INTO ANES OCCUPATIONS
*****************

* Bring in PH income scores
	* Only occupations whose PH wage is not system-missing are retained.
	use ./input/Occwage_PrestonHaines.dta, clear
	keep if occwage_PH_all != .

* Merge with population counts
	* Occupations present only in the census counts are not brought in;
	* PH occupations without any census records receive a count of zero.
	merge 1:1 occ1950 using `counts', keep(master match)
	replace number = 0 if _merge == 1
	drop _merge

* Calculate average income across all occupations 
	* -all- is a constant: it is the collapse group here and the key used
	* further down when the overall mean is merged back on.
	generate all = 1

	preserve

		* Mean of the imputed PH wage, weighted by the census counts,
		* saved under the name income_all.
		collapse (mean) income_all = occwage_PH_all_imp [pw=number], by(all)

		tempfile all
		save `all'

	restore
	
	
* Merge in ANES occupations
	* Crosswalk rows that match no PH occupation are left out.
	merge m:1 occ1950 using ../Crosswalks/Crosswalk_1950Census_toANES.dta, keep(master match) nogenerate

		* occ1950 codes above 970 and up to 999 are assigned ANES code 99
		replace occ1950ej = 99 if occ1950 > 970 & occ1950 <= 999

* Calculate average income by coarsened ANES occupation. Weight by number of men in each occupation

	* share = an occupation's fraction of the count within its ANES category
	egen total_pop = total(number), by(occ1950ej)
	generate share = number / total_pop

	* rawsum gives the unweighted total count per category; the means are
	* share-weighted and are stored directly under their final names.
	collapse (rawsum) number ///
	         (mean) occscore_PH = occwage_PH_all occscore_PH_farmfix = occwage_PH_all_imp all ///
	         [pw=share], by(occ1950ej)

	tab occ1950ej if occscore_PH != occscore_PH_farmfix //note: farming occs (71 and 81) are imputed

* Rename and save
	rename occ1950ej fatheroccej

* Merge in overall income
	* Every row must find the overall mean (assert(3)).
	merge m:1 all using `all', assert(3) nogen

	tempfile ph
	save `ph'
	
*------------------------------------------------------------------------------*
*------------------------------------------------------------------------------*

***************************
*** APPEND PRESTON & HAINES WITH RATIOS BY OCCUPATION X RACE X SOUTH
***************************

* Make template with all possible cells 
	//Note: Ratios of income across levels were created in the NBER server.

	use `ph', clear

	* Each step replicates the rows of the current cell and numbers the copies
	* from 1 upward: 2 race values, then 4 region values, then 5 edu values.
	expand 2
	bysort fatheroccej: generate byte race = _n

	expand 4
	bysort fatheroccej race: generate byte region = _n

	* Indicator for region 3, used as a merge key for the south-level ratios
	generate south_merge = (region == 3)

	expand 5
	bysort fatheroccej race region: generate byte edu = _n

	tempfile template
	save `template'

* Merge in ratios

	* Ratios keyed by occupation x race x south x edu. Rows found only in the
	* ratio file must belong to occupation 21; they are then discarded.
	merge m:1 fatheroccej race south_merge edu using ../CensusData/output/Ratios_1940_all_south_levels.dta
	assert fatheroccej == 21 if _merge == 2
	keep if _merge != 2
	drop _merge

	* Ratios keyed by occupation x race x region, handled the same way.
	merge m:1 fatheroccej race region using ../CensusData/output/Ratios_1940_all_region.dta
	assert fatheroccej == 21 if _merge == 2
	keep if _merge != 2
	drop _merge


	
* Fill in ratios that do exist but are showing up as missing (not in Census data)

	* Grouping variables that define the cell for each ratio suffix
	local cell_byrace          "fatheroccej race"
	local cell_byr_bys         "fatheroccej race south_merge"
	local cell_bysouth         "fatheroccej south_merge"
	local cell_just_race       "race"
	local cell_just_south      "south_merge"
	local cell_just_race_south "race south_merge"

	foreach x in incwage hh_income {

		foreach sfx in byrace byr_bys bysouth just_race just_south just_race_south {

			* Within each cell, copy the cell's (maximum) non-missing ratio
			* into the rows where the ratio is system-missing.
			tempvar cellmax
			bysort `cell_`sfx'': egen `cellmax' = max(ratio_`x'_`sfx')
			replace ratio_`x'_`sfx' = `cellmax' if ratio_`x'_`sfx' == .
			drop `cellmax'

		}

	}

	

* Merge in PH income
	* Every cell must match an occupation in the PH file (assert(3)).
	merge m:1 fatheroccej using `ph', assert(3) nogen

* Adjust average income for each cell using ratios 

	sort fatheroccej race region south_merge edu

	local inc "incwage" //note: HH income ratios can be used too

	* For each adjustment, scale both the original and the farm-fixed score
	foreach x in byrace bysouth byr_bys byr_byreg byr_bys_edu {

		display "`x'"

		foreach stub in PH PH_farmfix {
			generate income_`stub'_`x' = ratio_`inc'_`x' * occscore_`stub'
		}

	}

	* The unadjusted scores become the "by occupation" income measures
	rename occscore_PH income_PH_byocc
	rename occscore_PH_farmfix income_PH_farmfix_byocc

	//Rescale additional income measures that do not vary by occupation 
	foreach x in just_race just_south just_race_south {
		generate income_PH_`x' = ratio_`inc'_`x' * income_all
	}

	drop ratio*

* Second version of by race x south x edu that uses constant scale factor within edu.

	foreach stub in PH PH_farmfix {
		generate income_`stub'_byr_bys_edu_v2 = income_`stub'_byr_bys * scale_factor_`inc'
	}

	
	
* Keep only relevant variables
	* Expand the income-variable patterns once and de-duplicate the result, so
	* the variables kept and the variables converted are the same list and no
	* variable can be inflated twice if its name matches both patterns.
	unab incvars : *farmfix* *just*
	local incvars : list uniq incvars

	keep race fatheroccej region south_merge edu `incvars'
	
/* Convert income measures from 1900$ into 1950 dollars: 
   https://www.minneapolisfed.org/about-us/monetary-policy/inflation-calculator/consumer-price-index-1800- 
*/
	* The CPI levels are held in temporary scalars rather than in data columns:
	* scalars keep the constants in double precision and add no variables that
	* would have to be dropped afterwards.
	* NOTE(review): the CPI values are carried over unchanged from the original
	* script; confirm 72.3 (1950) and 25 (1900) against the table cited above.
	tempname cpi1950 cpi1900
	scalar `cpi1950' = 72.3
	scalar `cpi1900' = 25
	
	foreach var of local incvars {
	
		display "`var'"
	
		replace `var' = `var' * (`cpi1950'/`cpi1900')
	
	}
	
* Label some variables

	* Occupation-based (farm-fixed) PH income measures
	label variable income_PH_farmfix_byocc          "1901 PH, original"
	label variable income_PH_farmfix_byrace         "1901 PH, adjusted by race"
	label variable income_PH_farmfix_bysouth        "1901 PH, adjusted by south"
	label variable income_PH_farmfix_byr_bys        "1901 PH, adjusted by race x south "
	label variable income_PH_farmfix_byr_byreg      "1901 PH, adjusted by race x region"
	label variable income_PH_farmfix_byr_bys_edu    "1901 PH, adjusted by race x south x edu, using ratios"
	label variable income_PH_farmfix_byr_bys_edu_v2 "1901 PH, adjusted by race x south x edu, using scale factor"

	* Measures built from the overall mean income (no occupation variation)
	label variable income_PH_just_race       "1901 income, all by race"
	label variable income_PH_just_south      "1901 income, all by south"
	label variable income_PH_just_race_south "1901 income, all by race x south"

* Rename for future merge with survey data
	rename region region_merge

	* Shrink storage types, then write the final file
	compress
	save ./output/1_PH1901_incomescores.dta, replace
	